# coding:utf-8
from pyspark import SparkConf, SparkContext
import os

os.environ['JAVA_HOME'] = '/server/jdk'

if __name__ == '__main__':
    # BUG FIX: the original chained setAppName twice — the second call
    # ('local[*]') is a master URL, not an app name, so it clobbered the
    # app name and left no master configured. Use setMaster instead.
    conf = SparkConf().setAppName('test').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    rdd1 = sc.parallelize([1, 1, 1, 2, 2, 2, 3, 3, 3])
    rdd2 = sc.parallelize([('a', 1), ('a', 1), ('a', 3)])

    # distinct() de-duplicates elements; works for any hashable element
    # type (plain ints and tuples alike).
    print(rdd1.distinct().collect())
    print(rdd2.distinct().collect())

    # Release the SparkContext and its JVM/cluster resources.
    sc.stop()